1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.extractor;
28
29 import org.apache.log4j.Logger;
30 import org.smartcrawler.common.Link;
31 import org.smartcrawler.common.MalformedLinkException;
32 import org.smartcrawler.common.SCLogger;
33
34 /***
35 *
36 *
37 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
38 * @version <tt>$Revision: 1.7 $</tt>
39 */
40 public class LinkBuilderImpl implements LinkBuilder {
41
42
43 private static Logger log = SCLogger.getLogger(LinkBuilderImpl.class);
44 private static Logger logLink = SCLogger.getLinkLogger();
45
46 private Link parsedPageLink;
47 private String parsedPagePath;
48 private String hostName;
49
50 /***
51 * Creates a new instance of LinkBuilder
52 * @param parsedPageLink
53 */
54 public LinkBuilderImpl(Link parsedPageLink) {
55 this.parsedPageLink = parsedPageLink;
56 try {
57 hostName = parsedPageLink.getHost();
58 log.debug("LinkBuilderImpl(): hostName="+hostName);
59 parsedPagePath = parsedPageLink.getPath(false);
60
61 } catch (Exception e){
62 hostName = null;
63 log.debug("LinkBuilderImpl(): Invalid link " + parsedPageLink);
64 }
65 }
66
67 /***
68 *
69 * @param htmlURL
70 * @return
71 */
72 public Link buildLink(HtmlURL htmlURL) throws MalformedLinkException {
73
74 log.debug("buildLink(): BEGIN");
75
76 String extractedURL = htmlURL.getCleanedLinkAsString();
77
78 Link res = null;
79
80 log.debug("buildLink(): normalizing: " + extractedURL
81 + " of type " + HtmlURL.LINK_ABSOLUTE_URI);
82
83 String tmpLinkStr = "";
84
85
86 if(!htmlURL.isValid()) {
87 log.debug("buildLink(): Invalid link " + extractedURL);
88 return null;
89
90 }else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URI) {
91
92
93
94 tmpLinkStr = HtmlURL.PROTOCOL_PREF +
95 hostName + extractedURL;
96
97 } else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URL) {
98
99
100
101 tmpLinkStr = extractedURL;
102
103 } else if (htmlURL.getType() == HtmlURL.LINK_RELATIVE) {
104
105
106
107
108 tmpLinkStr = HtmlURL.PROTOCOL_PREF + hostName;
109 String tmpExtractedURL = extractedURL;
110 String newLinkPath = parsedPagePath;
111
112
113
114
115
116
117
118 if (!tmpExtractedURL.startsWith("../")) {
119 tmpLinkStr += newLinkPath + HtmlURL.PATH_SEP + tmpExtractedURL;
120
121 } else {
122
123 while(tmpExtractedURL.startsWith("../")) {
124
125
126
127
128
129
130
131
132
133
134
135
136 if (newLinkPath.length() > 0) {
137 int idx = newLinkPath.lastIndexOf(HtmlURL.PATH_SEP);
138 if (idx >= 0) newLinkPath = newLinkPath.substring(0, idx);
139 }
140 tmpExtractedURL = tmpExtractedURL.substring(3);
141 tmpLinkStr += HtmlURL.PATH_SEP + newLinkPath;
142
143
144
145
146
147
148
149
150
151
152 }
153
154 tmpLinkStr += tmpExtractedURL;
155 }
156
157 } else {
158 log.warn("buildLink(): url " + extractedURL + " UNHANDLED!! ");
159 }
160 res = new Link(tmpLinkStr);
161
162 logLink.info(parsedPageLink + " " + parsedPagePath
163 + " " + extractedURL + " " + res);
164
165 log.debug("buildLink(): curr. level: " + parsedPageLink +
166 " orig. link: " + extractedURL +
167 "; normalized: " + res);
168
169 log.debug("buildLink(): END");
170 return res;
171 }
172
173 }